For the reader: You can click the black “code” box on the right hand side to either see or hide the R code behind analysis results and graphs.
The results are arranged to separate tabs by site/area, so that it’s easy to look at just the results of the site of interest, instead of scrolling accidentally down to a different site and getting confused.
The results for each site consist of four parts: pXRF, particle size distribution, loss on ignition, and colorimetry.
Intro about the site
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
We end up with 20 elements in the “pxrf_all” dataset, and 12 elements in the “pxrf_no_na” dataset.
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Read the raw data
pxrf <- read.xlsx("data/pxrf_AY.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
values <- c("MgO", "Al2O3", "SiO2",
"S", "Cl", "K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
glimpse(pxrf_no_na)
## Rows: 47
## Columns: 14
## $ Area <fct> Area D Wall, Area D Wall, Area D Wall, Area D Wall, Area D Wall,~
## $ Type <fct> mud mortar, mud plaster, mud plaster, mud plaster, mud plaster, ~
## $ Al2O3 <dbl> -0.83430964, 0.28804627, 0.92438017, -0.75653070, 0.83252855, 0.~
## $ SiO2 <dbl> -1.07372922, 0.46591062, 0.91277017, -0.41768748, 1.37707345, 0.~
## $ K2O <dbl> -0.9354224, -0.1969237, 0.4729661, -0.8000151, 0.3529005, 0.9275~
## $ CaO <dbl> -0.31855193, -0.76710354, 1.55087898, -0.52286381, -0.30804896, ~
## $ Ti <dbl> -0.87635022, -0.13169866, 0.36204744, -0.56350248, 0.12413530, 0~
## $ Mn <dbl> -0.47857024, -0.36084140, 0.40974742, -0.26251105, 0.07529047, 0~
## $ Fe <dbl> -0.65915484, -0.30494130, 0.64604506, -0.48601854, 0.28383043, 1~
## $ Cu <dbl> -0.0382265, -0.9436700, -0.2645874, -0.3777678, -1.0568504, 0.30~
## $ Zn <dbl> -0.01720942, -0.65101964, -0.39749555, 0.10955263, -0.39749555, ~
## $ Rb <dbl> 0.36277317, -0.54198093, -0.29522982, 0.67121207, -0.54198093, 0~
## $ Sr <dbl> -0.11489739, -0.84951611, 0.60409114, -0.36107281, -0.41187092, ~
## $ Zr <dbl> -1.764549e-01, -7.787123e-01, 6.876654e-05, 1.298656e-01, -3.322~
pxrf_all:
glimpse(pxrf_all)
## Rows: 47
## Columns: 22
## $ Area <fct> Area D Wall, Area D Wall, Area D Wall, Area D Wall, Area D Wall,~
## $ Type <fct> mud mortar, mud plaster, mud plaster, mud plaster, mud plaster, ~
## $ MgO <dbl> -0.51417215, 0.64100358, 0.22606806, -0.12125145, 0.61130727, -0~
## $ Al2O3 <dbl> -0.83430964, 0.28804627, 0.92438017, -0.75653070, 0.83252855, 0.~
## $ SiO2 <dbl> -1.07372922, 0.46591062, 0.91277017, -0.41768748, 1.37707345, 0.~
## $ S <dbl> -0.2323566, -0.2339645, NA, NA, -0.1447233, NA, NA, NA, NA, NA, ~
## $ Cl <dbl> -0.45906368, 0.66024144, 0.47946871, -0.29005356, 0.12535227, -0~
## $ K2O <dbl> -0.9354224, -0.1969237, 0.4729661, -0.8000151, 0.3529005, 0.9275~
## $ CaO <dbl> -0.31855193, -0.76710354, 1.55087898, -0.52286381, -0.30804896, ~
## $ Ti <dbl> -0.87635022, -0.13169866, 0.36204744, -0.56350248, 0.12413530, 0~
## $ V <dbl> NA, NA, -0.009516182, NA, NA, 0.216968943, NA, NA, NA, 0.6861167~
## $ Mn <dbl> -0.47857024, -0.36084140, 0.40974742, -0.26251105, 0.07529047, 0~
## $ Fe <dbl> -0.65915484, -0.30494130, 0.64604506, -0.48601854, 0.28383043, 1~
## $ Ni <dbl> 1.17211765, -0.73894374, -0.73894374, 0.14833477, -0.92094959, 0~
## $ Cu <dbl> -0.0382265, -0.9436700, -0.2645874, -0.3777678, -1.0568504, 0.30~
## $ Zn <dbl> -0.01720942, -0.65101964, -0.39749555, 0.10955263, -0.39749555, ~
## $ As <dbl> NA, NA, -0.0963078, NA, -0.8953058, -0.2960573, NA, 2.3006863, N~
## $ Rb <dbl> 0.36277317, -0.54198093, -0.29522982, 0.67121207, -0.54198093, 0~
## $ Sr <dbl> -0.11489739, -0.84951611, 0.60409114, -0.36107281, -0.41187092, ~
## $ Y <dbl> NA, -0.85601627, -0.05383750, 0.28040365, -0.41036140, 0.3026864~
## $ Zr <dbl> -1.764549e-01, -7.787123e-01, 6.876654e-05, 1.298656e-01, -3.322~
## $ Nb <dbl> NA, NA, -0.4829764, NA, -0.8931231, NA, -0.4829764, NA, 0.952536~
Introduction
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Visualize correlations between elements
cor(pxrf_1[3:12]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements
# Elements with no NA values at all
colnames(pxrf_no_na)
## [1] "Area" "Type" "Al2O3" "SiO2" "K2O" "CaO" "Ti" "Mn" "Fe"
## [10] "Cu" "Zn" "Rb" "Sr" "Zr"
# PCA
pca_1 <- prcomp(pxrf_1[3:12])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9037 1.0026 0.8270 0.67213 0.37446 0.35729 0.29419
## Proportion of Variance 0.5809 0.1611 0.1096 0.07242 0.02248 0.02046 0.01387
## Cumulative Proportion 0.5809 0.7421 0.8517 0.92413 0.94661 0.96707 0.98095
## PC8 PC9 PC10
## Standard deviation 0.26346 0.16502 0.14906
## Proportion of Variance 0.01113 0.00437 0.00356
## Cumulative Proportion 0.99207 0.99644 1.00000
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam grouped by area")
autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam grouped by sample type")
PCA with all the feasible elements
# Elements that have at least one viable value
colnames(pxrf_all)
## [1] "Area" "Type" "MgO" "Al2O3" "SiO2" "S" "Cl" "K2O" "CaO"
## [10] "Ti" "V" "Mn" "Fe" "Ni" "Cu" "Zn" "As" "Rb"
## [19] "Sr" "Y" "Zr" "Nb"
# PCA with NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:22])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1392 1.2623 1.1731 0.77694 0.75938 0.56778 0.50254
## Proportion of Variance 0.4455 0.1551 0.1340 0.05877 0.05614 0.03138 0.02459
## Cumulative Proportion 0.4455 0.6006 0.7346 0.79337 0.84951 0.88090 0.90548
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.44075 0.42941 0.3738 0.33174 0.29859 0.26048 0.24174
## Proportion of Variance 0.01891 0.01795 0.0136 0.01071 0.00868 0.00661 0.00569
## Cumulative Proportion 0.92439 0.94235 0.9559 0.96666 0.97534 0.98195 0.98764
## PC15 PC16 PC17 PC18 PC19 PC20
## Standard deviation 0.22407 0.16224 0.16055 0.12292 0.08709 0.04448
## Proportion of Variance 0.00489 0.00256 0.00251 0.00147 0.00074 0.00019
## Cumulative Proportion 0.99253 0.99509 0.99760 0.99907 0.99981 1.00000
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam more elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam more elements grouped by area")
autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam more elements grouped by sample type")
Introduction
# HCA
# compute divisive hierarchical clustering
hc4 <- diana(pxrf_1[3:12])
# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.8007861
# Diana dendrogram
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")
# HCA dendrogam 1
dend <-
pxrf_1[3:12] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=5) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=5) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward5 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
## Particle size
# Read and filter data
particle <- read.xlsx("data/grain_AY.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "AY_grain2"
## write.xlsx(particle, file="data/grain/AY_grain2.xlsx")
# Manually added Type and Context columns to AY_grain2 data: "AY_grain3"
grain3 <- read.xlsx("data/grain/AY_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=2)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
Preparing data
loi <- read.xlsx("data/loi_AY.xlsx", sep=";")
tga <- read.xlsx("data/tga_AY.xlsx", sep=";")
summary(loi)
## sample crucible wet_weight sample_wet
## Length:52 Min. :8.079 Min. :10.13 Min. :1.538
## Class :character 1st Qu.:8.566 1st Qu.:11.00 1st Qu.:2.464
## Mode :character Median :9.006 Median :11.71 Median :2.635
## Mean :8.994 Mean :11.55 Mean :2.559
## 3rd Qu.:9.435 3rd Qu.:12.05 3rd Qu.:2.824
## Max. :9.925 Max. :12.68 Max. :3.043
## dry_weight after_550_C after_950_C context
## Min. :10.11 Min. :10.08 Min. :10.05 Length:52
## 1st Qu.:10.97 1st Qu.:10.94 1st Qu.:10.86 Class :character
## Median :11.67 Median :11.62 Median :11.53 Mode :character
## Mean :11.52 Mean :11.47 Mean :11.41
## 3rd Qu.:12.00 3rd Qu.:11.96 3rd Qu.:11.93
## Max. :12.65 Max. :12.60 Max. :12.52
## type
## Length:52
## Class :character
## Mode :character
##
##
##
summary(tga)
## Name Analysis;Date Batch Wet
## Length:53 Length:53 Length:53 Length:53
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Dry Mass_550 Mass_950
## Length:53 Length:53 Length:53
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))
# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)
# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name
# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)
loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100
# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100
Manual LOI results
# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)
# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum.
boxplot.stats(loi$c950)
## $stats
## [1] 0.398150 1.528018 2.198513 3.162314 4.889189
##
## $n
## [1] 52
##
## $conf
## [1] 1.840428 2.556598
##
## $out
## [1] 8.041755 11.960444
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)
# Removing duplicates from the next graph
rownames(loi)
## [1] "AY-1" "AY-2" "AY-3" "AY-4" "AY-5" "AY-6" "AY-7"
## [8] "AY-8" "AY-9" "AY-10" "AY-11" "AY-12" "AY-13" "AY-14"
## [15] "AY-15" "AY-16" "AY-17" "AY-18" "AY-19" "AY-20" "AY-21"
## [22] "AY-22" "AY-23" "AY-24" "AY-25" "AY-26" "AY-27" "AY-28"
## [29] "AY-29" "AY-30" "AY-31" "AY-32" "AY-33" "AY-34" "AY-35"
## [36] "AY-36" "AY-37" "AY-38" "AY-39" "AY-40" "AY-41" "AY-42"
## [43] "AY-50" "AY-51" "AY-52" "AY-53" "AY-54" "AY-50_2" "AY-51_2"
## [50] "AY-52_2" "AY-53_2" "AY-54_2"
loi <- subset(loi[1:47, ])
ggplot(loi,
aes(c550, c950, label = rownames(loi), colour = factor(context))) +
geom_point(size=2, aes(shape = factor(type))) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
ggplot(loi,
aes(c550, c950)) +
geom_point(size=2, aes(shape = factor(type), colour = factor(context))) +
labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
barplot(loi$c550)
barplot(loi$c950)
# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)
ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
TGA LOI results
# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(tga$c550, tga$c950)
# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum.
boxplot.stats(tga$c950)
## $stats
## [1] 0.7233976 1.5855926 2.3958197 3.4621816 5.4590326
##
## $n
## [1] 50
##
## $conf
## [1] 1.976504 2.815135
##
## $out
## [1] 7.994718
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])
ggplot(tga,
aes(c550, c950, label = rownames(tga))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
barplot(tga$c550)
barplot(tga$c950)
# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)
ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Libraries
library(dplyr);
library(ggplot2);
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)
color1 <- read.table("data/color_AY.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
## L a b
## Min. :37.56 Min. : 7.97 Min. :16.81
## 1st Qu.:42.13 1st Qu.: 9.72 1st Qu.:19.76
## Median :43.64 Median :10.38 Median :21.18
## Mean :43.92 Mean :10.49 Mean :21.22
## 3rd Qu.:45.91 3rd Qu.:11.29 3rd Qu.:22.34
## Max. :48.24 Max. :13.53 Max. :25.03
ggplot(color,
aes(a, b, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Color") +
xlab("a D65") +
ylab("b D65")
ggplot(color1,
aes(X, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("Sample number")
ggplot(color,
aes(a, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("a")
ggplot(color,
aes(b, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("b")
# Simple 3D plot
plot3d(color$L, color$a, color$b)
# Colorful 3D plot with drop lines
scatter3D(color$L, color$a, color$b,
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 0.5)
text3D(color$L, color$a, color$b, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.5)
Intro about the site
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
Data preparation
## Particle size
# Read and filter data
particle <- read.xlsx("data/grain_AYB.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "AY_grain2"
## write.xlsx(particle, file="data/granulometry/AY_grain2.xlsx")
# Manually added Type and Context columns to AY_grain2 data: "AY_grain3"
grain3 <- read.xlsx("data/granulometry/AY_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
Intro about the site
Introduction
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Read data
nn <- read.xlsx("data/pxrf_israel.xlsx", sep=";")
nn <- as.data.frame(nn)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
Data preparation
# Read and filter data
particle <- read.xlsx("data/grain_israel.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "israel_grain2"
## write.xlsx(particle, file="data/granulometry/israel_grain2.xlsx")
# Manually added Type and Context columns to israel_grain2 data: "israel_grain3"
grain3 <- read.xlsx("data/granulometry/israel_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
Data preparation
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
Data preparation
Intro about the site
Introduction
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(FactoMineR); # fast PCA graphs
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(tidyverse) # rectangles around HClust
Data preparation
# Read data
pxrf <- read.xlsx("data/pxrf_PP.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
values <- c("MgO", "Al2O3", "SiO2",
"S", "Cl", "K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(pxrf_NA, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
# Datasets with all the samples, including soil and old mudbrick samples
summary(pxrf_no_na)
## Area Type Ti Mn Fe
## new :11 mudbrick:45 Min. :0.0969 Min. :0.01190 Min. :0.9617
## old :34 soil : 8 1st Qu.:0.1814 1st Qu.:0.03893 1st Qu.:2.2241
## soil: 8 Median :0.2118 Median :0.04670 Median :2.7214
## Mean :0.2180 Mean :0.04549 Mean :2.8091
## 3rd Qu.:0.2594 3rd Qu.:0.05440 3rd Qu.:3.3530
## Max. :0.3571 Max. :0.06800 Max. :4.7754
## Ni Cu Zr
## Min. :0.000400 Min. :0.003600 Min. :0.004300
## 1st Qu.:0.002200 1st Qu.:0.005933 1st Qu.:0.007800
## Median :0.003000 Median :0.006500 Median :0.008800
## Mean :0.002856 Mean :0.006761 Mean :0.008810
## 3rd Qu.:0.003500 3rd Qu.:0.007400 3rd Qu.:0.009667
## Max. :0.005900 Max. :0.011333 Max. :0.012200
summary(pxrf_all)
## Area Type MgO Al2O3 SiO2
## new :11 mudbrick:45 Min. :2.919 Min. :2.182 Min. : 9.164
## old :34 soil : 8 1st Qu.:3.369 1st Qu.:3.403 1st Qu.:14.598
## soil: 8 Median :4.088 Median :4.317 Median :17.087
## Mean :4.011 Mean :4.168 Mean :16.502
## 3rd Qu.:4.499 3rd Qu.:4.702 3rd Qu.:18.560
## Max. :5.125 Max. :5.902 Max. :22.259
## NA's :34 NA's :34 NA's :34
## S Cl K2O CaO
## Min. :0.04373 Min. :0.01113 Min. :0.1980 Min. :11.28
## 1st Qu.:0.10686 1st Qu.:0.02597 1st Qu.:0.3599 1st Qu.:14.53
## Median :0.21532 Median :0.05117 Median :0.4238 Median :18.05
## Mean :0.25538 Mean :0.13135 Mean :0.4340 Mean :17.94
## 3rd Qu.:0.32239 3rd Qu.:0.10892 3rd Qu.:0.4795 3rd Qu.:20.82
## Max. :0.74597 Max. :0.93440 Max. :0.6584 Max. :25.59
## NA's :35 NA's :34 NA's :36 NA's :34
## Ti V Mn Fe
## Min. :0.0969 Min. :0.00267 Min. :0.01190 Min. :0.9617
## 1st Qu.:0.1814 1st Qu.:0.00334 1st Qu.:0.03893 1st Qu.:2.2241
## Median :0.2118 Median :0.00420 Median :0.04670 Median :2.7214
## Mean :0.2180 Mean :0.00425 Mean :0.04549 Mean :2.8091
## 3rd Qu.:0.2594 3rd Qu.:0.00512 3rd Qu.:0.05440 3rd Qu.:3.3530
## Max. :0.3571 Max. :0.00587 Max. :0.06800 Max. :4.7754
## NA's :41
## Ni Cu Zn As
## Min. :0.000400 Min. :0.003600 Min. :0.00167 Min. :0.00033
## 1st Qu.:0.002200 1st Qu.:0.005933 1st Qu.:0.00308 1st Qu.:0.00033
## Median :0.003000 Median :0.006500 Median :0.00337 Median :0.00040
## Mean :0.002856 Mean :0.006761 Mean :0.00335 Mean :0.00040
## 3rd Qu.:0.003500 3rd Qu.:0.007400 3rd Qu.:0.00382 3rd Qu.:0.00047
## Max. :0.005900 Max. :0.011333 Max. :0.00500 Max. :0.00047
## NA's :34 NA's :48
## Rb Sr Y Zr
## Min. :0.00120 Min. :0.01790 Min. :0.00087 Min. :0.004300
## 1st Qu.:0.00173 1st Qu.:0.02893 1st Qu.:0.00137 1st Qu.:0.007800
## Median :0.00203 Median :0.03403 Median :0.00160 Median :0.008800
## Mean :0.00204 Mean :0.03402 Mean :0.00161 Mean :0.008810
## 3rd Qu.:0.00213 3rd Qu.:0.03835 3rd Qu.:0.00173 3rd Qu.:0.009667
## Max. :0.00357 Max. :0.04840 Max. :0.00260 Max. :0.012200
## NA's :35 NA's :34 NA's :36
## Nb Rh Ag Pb
## Min. :0.00063 Min. :0.01570 Min. :0.00230 Min. :0.002
## 1st Qu.:0.00063 1st Qu.:0.01683 1st Qu.:0.00282 1st Qu.:0.002
## Median :0.00063 Median :0.01797 Median :0.00322 Median :0.002
## Mean :0.00063 Mean :0.01797 Mean :0.00330 Mean :0.002
## 3rd Qu.:0.00063 3rd Qu.:0.01910 3rd Qu.:0.00369 3rd Qu.:0.002
## Max. :0.00063 Max. :0.02023 Max. :0.00447 Max. :0.002
## NA's :52 NA's :51 NA's :49 NA's :52
# Datasets with only our new mudbrick samples (no soil or previously published ones)
MB_nona <- pxrf_no_na[c(1:11), ]
MB_all <- pxrf_all[c(1:11), ]
# Removing As, Nb, Rh, Ag and Pb as they have almost no viable measurement values
MB_all <- MB_all %>%
select(-As) %>%
select(-Nb) %>%
select(-Rh) %>%
select(-Ag) %>%
select(-Pb)
summary(MB_nona)
## Area Type Ti Mn Fe
## new :11 mudbrick:11 Min. :0.0969 Min. :0.01190 Min. :0.9617
## old : 0 soil : 0 1st Qu.:0.1663 1st Qu.:0.02397 1st Qu.:1.6270
## soil: 0 Median :0.1915 Median :0.03347 Median :2.1312
## Mean :0.1780 Mean :0.03070 Mean :1.8912
## 3rd Qu.:0.1941 3rd Qu.:0.03895 3rd Qu.:2.2622
## Max. :0.2295 Max. :0.04177 Max. :2.4481
## Ni Cu Zr
## Min. :0.001400 Min. :0.004500 Min. :0.005400
## 1st Qu.:0.002317 1st Qu.:0.005533 1st Qu.:0.007767
## Median :0.003000 Median :0.006700 Median :0.008733
## Mean :0.002812 Mean :0.007333 Mean :0.008494
## 3rd Qu.:0.003367 3rd Qu.:0.008717 3rd Qu.:0.009100
## Max. :0.003967 Max. :0.011333 Max. :0.011833
summary(MB_all)
## Area Type MgO Al2O3 SiO2
## new :11 mudbrick:11 Min. :2.919 Min. :2.182 Min. : 9.164
## old : 0 soil : 0 1st Qu.:3.531 1st Qu.:3.403 1st Qu.:13.366
## soil: 0 Median :4.088 Median :4.317 Median :17.705
## Mean :3.999 Mean :3.941 Mean :15.874
## 3rd Qu.:4.400 3rd Qu.:4.593 3rd Qu.:18.560
## Max. :5.125 Max. :5.270 Max. :20.334
##
## S Cl K2O CaO
## Min. :0.1622 Min. :0.02723 Min. :0.1980 Min. :11.28
## 1st Qu.:0.2737 1st Qu.:0.06783 1st Qu.:0.3920 1st Qu.:18.46
## Median :0.3221 Median :0.09397 Median :0.4376 Median :20.74
## Mean :0.3550 Mean :0.20759 Mean :0.4312 Mean :19.67
## 3rd Qu.:0.3809 3rd Qu.:0.23782 3rd Qu.:0.4757 3rd Qu.:22.12
## Max. :0.7460 Max. :0.93440 Max. :0.6052 Max. :25.59
## NA's :1
## Ti V Mn Fe
## Min. :0.0969 Min. :0.002667 Min. :0.01190 Min. :0.9617
## 1st Qu.:0.1663 1st Qu.:0.004317 1st Qu.:0.02397 1st Qu.:1.6270
## Median :0.1915 Median :0.004750 Median :0.03347 Median :2.1312
## Mean :0.1780 Mean :0.004721 Mean :0.03070 Mean :1.8912
## 3rd Qu.:0.1941 3rd Qu.:0.005633 3rd Qu.:0.03895 3rd Qu.:2.2622
## Max. :0.2295 Max. :0.005867 Max. :0.04177 Max. :2.4481
## NA's :3
## Ni Cu Zn Rb
## Min. :0.001400 Min. :0.004500 Min. :0.001667 Min. :0.001200
## 1st Qu.:0.002317 1st Qu.:0.005533 1st Qu.:0.002467 1st Qu.:0.001617
## Median :0.003000 Median :0.006700 Median :0.003233 Median :0.001767
## Mean :0.002812 Mean :0.007333 Mean :0.003170 Mean :0.001750
## 3rd Qu.:0.003367 3rd Qu.:0.008717 3rd Qu.:0.003800 3rd Qu.:0.001925
## Max. :0.003967 Max. :0.011333 Max. :0.005000 Max. :0.002133
## NA's :1
## Sr Y Zr
## Min. :0.01790 Min. :0.0008667 Min. :0.005400
## 1st Qu.:0.02893 1st Qu.:0.0013333 1st Qu.:0.007767
## Median :0.03507 Median :0.0014000 Median :0.008733
## Mean :0.03438 Mean :0.0013741 Mean :0.008494
## 3rd Qu.:0.03835 3rd Qu.:0.0014667 3rd Qu.:0.009100
## Max. :0.04840 Max. :0.0017333 Max. :0.011833
## NA's :2
Elemental correlations in new mudbrick samples
# Visualize correlations between elements
cor(MB_all[3:20]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")
K-means cluster analysis
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(MB_nona[3:8], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(MB_nona[3:8], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(MB_nona[3:8], mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements
# PCA for only non-NA elements
colnames(MB_nona)
## [1] "Area" "Type" "Ti" "Mn" "Fe" "Ni" "Cu" "Zr"
pca_1 <- prcomp(MB_nona[3:8], scale = TRUE, center = TRUE)
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0136 1.0789 0.70395 0.45806 0.25323 0.10779
## Proportion of Variance 0.6758 0.1940 0.08259 0.03497 0.01069 0.00194
## Cumulative Proportion 0.6758 0.8698 0.95241 0.98738 0.99806 1.00000
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_1, data=MB_nona, colour="red", shape = FALSE, label = TRUE, main = "PCA Palaepaphos 1: no NA:s at all")
PCA(MB_nona[3:8])
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 11 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
PCA with all the feasible elements
# NA:s converted to zeros
MB_all[is.na(MB_all)] <- 0
colnames(MB_all)
## [1] "Area" "Type" "MgO" "Al2O3" "SiO2" "S" "Cl" "K2O" "CaO"
## [10] "Ti" "V" "Mn" "Fe" "Ni" "Cu" "Zn" "Rb" "Sr"
## [19] "Y" "Zr"
pca_2 <- prcomp((MB_all[3:20]), scale = TRUE, center = TRUE)
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.1620 1.8251 1.26077 1.10902 0.8950 0.63845 0.5612
## Proportion of Variance 0.5554 0.1851 0.08831 0.06833 0.0445 0.02265 0.0175
## Cumulative Proportion 0.5554 0.7405 0.82882 0.89715 0.9416 0.96429 0.9818
## PC8 PC9 PC10 PC11
## Standard deviation 0.4388 0.34174 0.13583 2.812e-16
## Proportion of Variance 0.0107 0.00649 0.00103 0.000e+00
## Cumulative Proportion 0.9925 0.99897 1.00000 1.000e+00
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_2, data=MB_all, colour="red", shape = FALSE, label = TRUE, main = "PCA Palaepaphos 2: NA:s permitted")
PCA(MB_all[3:20])
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 11 individuals, described by 18 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
PCA with all the Mudbrick samples (including the previously published ones): No NA values permitted
# PCA for only non-NA elements
colnames(pxrf_no_na)
## [1] "Area" "Type" "Ti" "Mn" "Fe" "Ni" "Cu" "Zr"
pca_3 <- prcomp(na.omit(pxrf_no_na[3:6]), scale = TRUE, center = TRUE)
summary(pca_3)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.6514 0.9995 0.41614 0.31724
## Proportion of Variance 0.6818 0.2498 0.04329 0.02516
## Cumulative Proportion 0.6818 0.9315 0.97484 1.00000
biplot(pca_3, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_3, data=pxrf_no_na, colour='Area', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by area")
autoplot(pca_3, data=pxrf_no_na, colour='Type', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by sample type")
PCA with all the Mudbrick samples (including the previously published ones): NA values are permitted
# NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
colnames(pxrf_all)
## [1] "Area" "Type" "MgO" "Al2O3" "SiO2" "S" "Cl" "K2O" "CaO"
## [10] "Ti" "V" "Mn" "Fe" "Ni" "Cu" "Zn" "As" "Rb"
## [19] "Sr" "Y" "Zr" "Nb" "Rh" "Ag" "Pb"
pca_4 <- prcomp(pxrf_all[3:25], scale = TRUE, center = TRUE)
summary(pca_4)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.3748 1.8765 1.50629 1.13013 1.04455 0.89816 0.83735
## Proportion of Variance 0.4952 0.1531 0.09865 0.05553 0.04744 0.03507 0.03049
## Cumulative Proportion 0.4952 0.6483 0.74692 0.80245 0.84989 0.88497 0.91545
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.77794 0.66209 0.53427 0.40495 0.35837 0.33354 0.27679
## Proportion of Variance 0.02631 0.01906 0.01241 0.00713 0.00558 0.00484 0.00333
## Cumulative Proportion 0.94176 0.96082 0.97323 0.98036 0.98595 0.99078 0.99412
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.22788 0.17584 0.14605 0.12032 0.10296 0.06388 0.04325
## Proportion of Variance 0.00226 0.00134 0.00093 0.00063 0.00046 0.00018 0.00008
## Cumulative Proportion 0.99637 0.99772 0.99865 0.99927 0.99974 0.99991 0.99999
## PC22 PC23
## Standard deviation 0.01157 2.376e-16
## Proportion of Variance 0.00001 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
biplot(pca_4, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_4, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by area")
autoplot(pca_4, data=pxrf_all, colour='Type', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by sample type")
HCA analysis
# HCA
# compute divisive hierarchical clustering
hc4 <- diana(pxrf_all[3:25])
# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.9618308
# Diana dendrogram
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")
# HCA dendrogam 1
dend <-
pxrf_all[3:25] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=5) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=5) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward5 <- hclust(dist(pxrf_all[3:25], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Read and filter data
particle <- read.xlsx("data/grain_PP.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "PP_grain2"
## write.xlsx(particle, file="data/granulometry/PP_grain2.xlsx")
# Manually added Type and Context columns to PP_grain2 data: "PP_grain3"
grain3 <- read.xlsx("data/granulometry/PP_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
MB_grain <- grain[c(9:19),]
MB_context <- grain3[c(9:19),]
ggtern(data=MB_grain, aes(x=Clay, y=Sand, z=Silt, color=MB_context$Locus)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(MB_grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
Introduction
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(ggdendro) # dendrograms together with ggplot
library(dendextend) # dendrograms - change parameters
library(tidyverse) # rectangles around HClust
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(ggdendro) # dendrograms together with ggplot
library(dendextend) # dendrograms - change parameters
library(tidyverse) # rectangles around HClust
Intro about the site
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Read the raw data
pxrf <- read.xlsx("data/pxrf_AA.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
# Removing "Na", S", "Cl", "P" and "Ba" already
values <- c("MgO", "Al2O3", "SiO2",
"K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
glimpse(pxrf_no_na)
## Rows: 10
## Columns: 17
## $ Area <fct> Hill II, Hill II, Hill II, Phase I, Phase I, Phase II, Phase II,~
## $ Type <fct> mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudb~
## $ MgO <dbl> -0.1594010, 0.1129059, -0.4510038, -0.3887203, 0.4369466, 1.9131~
## $ Al2O3 <dbl> 0.6990613, 0.2737338, -0.1611149, -0.2592941, 0.4246452, 1.38943~
## $ SiO2 <dbl> 0.85557445, 0.61604918, -0.12790064, 0.09683497, 0.61414855, 0.7~
## $ K2O <dbl> 0.60692024, 0.02816062, -0.75789490, 0.04880237, 1.11418034, 1.0~
## $ CaO <dbl> -0.54131017, 1.20914863, 0.05989818, 0.23434386, 1.02571929, 0.9~
## $ Ti <dbl> 0.8351083, 0.8389159, -0.2277469, -0.1097123, 0.8688325, 0.73502~
## $ Mn <dbl> -0.16154103, 0.23307430, -0.07138547, -0.11572427, 0.55674754, 1~
## $ Fe <dbl> 0.05533421, 0.13332917, -0.15388527, -0.15233743, 0.09919632, 1.~
## $ Ni <dbl> 0.82749061, -0.04503350, -0.80497386, -0.94570355, -0.24205508, ~
## $ Cu <dbl> 8.205941e-01, 2.188251e-01, -1.039419e+00, 4.811057e-16, 4.37650~
## $ Zn <dbl> 0.41186175, -0.19830381, -0.88474007, -0.02033885, 0.33559106, 1~
## $ Rb <dbl> 1.13127170, -0.15127470, -0.51301856, -1.20362047, -0.01973148, ~
## $ Sr <dbl> -1.40588371, -0.28024838, 0.07368850, 0.63070392, 1.43141359, -0~
## $ Y <dbl> 0.57088572, 0.04228783, -0.22201111, -0.27487090, -0.01057196, 1~
## $ Zr <dbl> -0.4713101, -1.1158368, -0.5317345, -0.2094712, 1.3011383, 1.905~
pxrf_all:
glimpse(pxrf_all)
## Rows: 10
## Columns: 21
## $ Area <fct> Hill II, Hill II, Hill II, Phase I, Phase I, Phase II, Phase II,~
## $ Type <fct> mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudb~
## $ MgO <dbl> -0.1594010, 0.1129059, -0.4510038, -0.3887203, 0.4369466, 1.9131~
## $ Al2O3 <dbl> 0.6990613, 0.2737338, -0.1611149, -0.2592941, 0.4246452, 1.38943~
## $ SiO2 <dbl> 0.85557445, 0.61604918, -0.12790064, 0.09683497, 0.61414855, 0.7~
## $ K2O <dbl> 0.60692024, 0.02816062, -0.75789490, 0.04880237, 1.11418034, 1.0~
## $ CaO <dbl> -0.54131017, 1.20914863, 0.05989818, 0.23434386, 1.02571929, 0.9~
## $ Ti <dbl> 0.8351083, 0.8389159, -0.2277469, -0.1097123, 0.8688325, 0.73502~
## $ V <dbl> 0.02741905, 0.56361382, NA, NA, -0.43565825, -0.19193335, -0.460~
## $ Cr <dbl> NA, NA, NA, NA, -0.6861787, 0.2217828, NA, NA, NA, NA
## $ Mn <dbl> -0.16154103, 0.23307430, -0.07138547, -0.11572427, 0.55674754, 1~
## $ Fe <dbl> 0.05533421, 0.13332917, -0.15388527, -0.15233743, 0.09919632, 1.~
## $ Ni <dbl> 0.82749061, -0.04503350, -0.80497386, -0.94570355, -0.24205508, ~
## $ Cu <dbl> 8.205941e-01, 2.188251e-01, -1.039419e+00, 4.811057e-16, 4.37650~
## $ Zn <dbl> 0.41186175, -0.19830381, -0.88474007, -0.02033885, 0.33559106, 1~
## $ As <dbl> 1.10179138, -0.87461790, NA, -0.05111403, 0.27828751, 0.44298829~
## $ Rb <dbl> 1.13127170, -0.15127470, -0.51301856, -1.20362047, -0.01973148, ~
## $ Sr <dbl> -1.40588371, -0.28024838, 0.07368850, 0.63070392, 1.43141359, -0~
## $ Y <dbl> 0.57088572, 0.04228783, -0.22201111, -0.27487090, -0.01057196, 1~
## $ Zr <dbl> -0.4713101, -1.1158368, -0.5317345, -0.2094712, 1.3011383, 1.905~
## $ Nb <dbl> 0.93500042, -0.65796326, NA, -0.06060188, -0.06060188, -0.657963~
Introduction
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Visualize correlations between elements
cor(pxrf_1[3:15]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements
# PCA
pca_1 <- prcomp(pxrf_1[3:15])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4147 1.0889 0.9306 0.48309 0.37987 0.31662 0.24466
## Proportion of Variance 0.6901 0.1403 0.1025 0.02762 0.01708 0.01186 0.00708
## Cumulative Proportion 0.6901 0.8305 0.9330 0.96059 0.97767 0.98953 0.99662
## PC8 PC9 PC10
## Standard deviation 0.15630 0.06455 7.266e-17
## Proportion of Variance 0.00289 0.00049 0.000e+00
## Cumulative Proportion 0.99951 1.00000 1.000e+00
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA limited elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA grouped by area")
PCA with all the feasible elements
# PCA with all elements, NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:21])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6421 1.2026 0.98838 0.84185 0.45012 0.39002 0.30457
## Proportion of Variance 0.6529 0.1353 0.09136 0.06628 0.01895 0.01423 0.00868
## Cumulative Proportion 0.6529 0.7881 0.87946 0.94574 0.96469 0.97891 0.98759
## PC8 PC9 PC10
## Standard deviation 0.28180 0.23084 2.238e-16
## Proportion of Variance 0.00743 0.00498 0.000e+00
## Cumulative Proportion 0.99502 1.00000 1.000e+00
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA more elements")
autoplot(pca_2, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA more elements grouped by area")
HCA
# HCA
# HCA dendrogam 1
dend <-
pxrf_1[3:12] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=4) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=4) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward4 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward4, main="HCA with clusters")
rect.hclust(HCA.ward4, k=4)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
## Particle size
# Read and filter data
particle <- read.xlsx("data/grain_AA.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "AY_grain2"
## write.xlsx(particle, file="data/granulometry/AA_grain2.xlsx")
# Manually added Type and Context columns to AA_grain2 data: "AA_grain3"
grain3 <- read.xlsx("data/granulometry/AA_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots